Step 1: Load filtered features, barcode, matrices and visualise quality

In this experiment, we had only 1 library, called “D33_D45_D47_CD4_Tcells”.

The hastag antibodies used were:

# lib name
hto_used = samplesheet %>%
    filter(Library_name == lib_name) %>%
    pull(Antibody) %>%
    unique

hto_used
##  [1] "TotalSeq_C_2"  "TotalSeq_C_5"  "TotalSeq_C_6"  "TotalSeq_C_7" 
##  [5] "TotalSeq_C_8"  "TotalSeq_C_9"  "TotalSeq_C_10" "TotalSeq_C_11"
##  [9] "TotalSeq_C_12" "TotalSeq_C_3"  "TotalSeq_C_4"

Number of features (genes) and samples (cells) in this library

# Load the PBMC dataset
#data <- Read10X(data.dir = paste0("../counts/",lib_name,"/filtered_feature_bc_matrix/"))
# for CellBender, do on cmd:
# ptrepack --complevel 5 output.h5:/matrix output_filtered_seurat.h5:/matrix

# output from Cellbender:
lib_name = "D33_D45_D47_CD4_Tcells"
#data <- Read10X(data.dir = paste0("../Data/",lib_name,"/sample_filtered_feature_bc_matrix/"))
data <- Read10X_h5(filename = "../Data/D33_D45_D47_CD4_Tcells/h5/output_filtered_seurat.h5")
gex <- data[[1]]
hto <- data[["Antibody Capture"]][rownames(data[["Antibody Capture"]]) %in% hto_used, ]

# remove SCoV2 genes
# plasmo.gex <- gex[grep(rownames(gex), pattern = "^gene-PF3D7"),] # assay with Plasmodium genes
# human.gex <- gex[grep(rownames(gex), pattern = "^gene-PF3D7", invert = T),]


# Initialize the Seurat object with the raw (non-normalized data).
#mono <- CreateSeuratObject(counts = gex, project = lib_name, min.cells = 3, min.features = 200)
options(Seurat.object.assay.calcn = TRUE)
hustle <- CreateSeuratObject(counts = gex, project = lib_name, min.cells = 3, min.features = 10)
hustle
## An object of class Seurat 
## 14430 features across 4322 samples within 1 assay 
## Active assay: RNA (14430 features, 0 variable features)
##  1 layer present: counts
hustle[["percent.mt"]] <- PercentageFeatureSet(hustle, pattern = "^MT-")
# ribo genes
ribo.genes <- grep("^RP[SL][[:digit:]]|^RPLP[[:digit:]]|^RPSA",rownames(hustle),value = TRUE)
ribo.genes <- grep("[BK-]|AP|[[:digit:]](.+)L",ribo.genes,value = TRUE, invert = T)
# percent.ribo <- colSums(hustle[ribo.genes, ])/colSums(hustle)
# hustle <- AddMetaData(object = hustle, metadata = percent.ribo, col.name = "percent.ribo")
hustle[["percent.ribo"]] <- PercentageFeatureSet(object = hustle, features = ribo.genes)
head(hustle@meta.data, 5)

Demultiplex: Assign donor to cells - singlets, doublets, negatives using vireo

  1. match donor ID with samples (cells)
# install / upgrade vireoSNP using pip install --upgrade --no-deps vireoSNP
# vireo to check installation errors
# download all cellSNP file from library -> save vcf file as .gz, leave others un-gzipped
# error: index out of range -> solved by downloading the files again and redoing vireo
# command line: vireo -c cellSNP_mat/ -N 3 -o vireo_result/ for 3 donors

# add vireo output to meta data
hustle.demul <- hustle
snp <- read.delim(paste("../Data/",lib_name,"/vireo_result/donor_ids.tsv", sep = ""))
meta <- hustle.demul@meta.data %>% 
  tibble::rownames_to_column("cell") %>% 
  left_join(snp)

table(meta$donor_id)
## 
##     donor0     donor1     donor2    doublet unassigned 
##        265        340       1638        106        192
# remove doublets and unassigned
hustle.demul@meta.data <- meta

hustle.demul@meta.data <- hustle.demul@meta.data %>% 
  as.data.frame() %>% 
  tibble::column_to_rownames("cell")

## didn't need the next line for CellBender output:
Idents(hustle.demul) <- "donor_id"
hustle.demul <- subset(x = hustle.demul, idents = c("doublet", "unassigned"), invert = T)

hustle <- hustle.demul

# identical(meta$cell, rownames(mono@meta.data))
# [1] TRUE
  1. Demultiplexing with hastag oligos -> identify cells with tagged barcodes (# of barcodes (Ab) = # of donors)
#Cell Hashing uses oligo-tagged antibodies against ubuquitously expressed surface proteins to place a “sample barcode” on each single cell, enabling different samples to be multiplexed together and run in a single experiment.

# Select cell barcodes detected by both RNA and HTO In the example datasets we have already
# filtered the cells for you, but perform this step for clarity.

hustle.singlet.donor <- hustle
joint.bcs <- intersect(rownames(hustle.singlet.donor@meta.data), colnames(hto))

# identical(colnames(pbmc@assays$RNA), rownames(pbmc@meta.data))
# [1] TRUE

# Subset RNA and HTO counts by joint cell barcodes
#pbmc@assays$RNA <- pbmc@assays$RNA[, joint.bcs]
hustle.hto <- as.matrix(hto[,joint.bcs])

## # Add HTO data as a new assay independent from RNA
Idents(hustle.singlet.donor) <- rownames(hustle.singlet.donor@meta.data)
hustle.singlet.donor[["HTO"]] <- CreateAssayObject(counts = hustle.hto)

# Normalize HTO data, here we use centered log-ratio (CLR) transformation
hustle.singlet.donor <- NormalizeData(hustle.singlet.donor, assay = "HTO", normalization.method = "CLR")

# If you have a very large dataset we suggest using k_function = 'clara'. This is a k-medoid
# clustering function for large applications You can also play with additional parameters (see
# documentation for HTODemux()) to adjust the threshold for classification Here we are using
# the default settings
hustle.singlet.donor <- HTODemux(hustle.singlet.donor, assay = "HTO", positive.quantile = 0.99) #, positive.quantile = 0.99
#hustle.singlet.donor <- MULTIseqDemux(hustle.singlet.donor, assay = "HTO")
# Global classification results
table(hustle.singlet.donor$HTO_classification.global)
## 
##  Doublet Negative  Singlet 
##     1099      462     2463
#table(hustle.singlet.donor$MULTI_classification)

Idents(hustle.singlet.donor) <- "HTO_classification.global"
#Idents(hustle.singlet.donor) <- "MULTI_classification"
VlnPlot(hustle.singlet.donor, features = c("nFeature_RNA", "nCount_RNA"), pt.size = 0.1, log = F)

#VlnPlot(hustle.singlet.donor, features = c("percent.mt", "percent.plasmo"), pt.size = 0.1, log = TRUE)
hustle.h <- hustle.singlet.donor

# First, we will remove negative cells from the object
# if(any(hustle.h$MULTI_classification == "Negative") == T)
#   hustle.h <- subset(hustle.h, idents = c("Negative"), invert = TRUE)

if(any(hustle.h$HTO_classification.global == "Negative") == T)
   hustle.h <- subset(hustle.h, idents = c("Negative"), invert = TRUE)

# Calculate a tSNE embedding of the HTO data
DefaultAssay(hustle.h) <- "HTO"
hustle.h <- ScaleData(hustle.h, features = rownames(hustle.h),
    verbose = FALSE)
hustle.h <- RunPCA(hustle.h, features = rownames(hustle.h), approx = FALSE)
hustle.h <- RunTSNE(hustle.h, check_duplicates = F)
Idents(hustle.h) <- 'HTO_classification'
#Idents(hustle.h) <- 'MULTI_classification'
#DimPlot(hustle.h)



# To increase the efficiency of plotting, you can subsample cells using the num.cells argument
#HTOHeatmap(hustle, assay = "HTO", ncells = 500)

Features and samples after assigning cells to donors and retaining only singlets

Idents(hustle.singlet.donor) = "HTO_classification"
# Extract singlets
#hashtags <- gsub("Total_Seq", "TotalSeq", hto_used)
hashtags <- gsub("_", "-", hto_used)

# hustle.singlet.donor@meta.data <- hustle.singlet.donor@meta.data %>%
#   mutate(Cell_state = case_when(
#     as.character(MULTI_classification) %in% hashtags ~ "Singlet",
#     .default = MULTI_classification
#   ))

#Idents(hustle.singlet.donor) <- "Cell_state"

#hustle.singlet <- subset(hustle.singlet.donor, subset = Cell_state == "Singlet")

# For HTODemux:
hustle.singlet.donor = hustle.h
Idents(hustle.singlet.donor) = "HTO_classification.global"
hustle.singlet <- subset(hustle.singlet.donor, idents = "Singlet")
hustle.singlet
## An object of class Seurat 
## 14441 features across 2463 samples within 2 assays 
## Active assay: HTO (11 features, 0 variable features)
##  3 layers present: counts, data, scale.data
##  1 other assay present: RNA
##  2 dimensional reductions calculated: pca, tsne
# donorid <- samplesheet %>%
#   filter(Library_name == lib_name) %>%
#   dplyr::rename(HTO_classification = Antibody) %>%
#   #mutate(HTO_classification = gsub("_", "-", HTO_classification)) %>%
#   dplyr::rename(Group = `Patient Group`) %>%
#   select(Group, HTO_classification) %>%
#   distinct()

# donorid <- samplesheet %>%
#   filter(Library_name == lib_name) %>%
#   dplyr::rename(MULTI_classification = Antibody) %>%
#   mutate(MULTI_classification = gsub("_", "-", MULTI_classification)) %>%
#   dplyr::rename(Group = `Patient Group`) %>%
#   select(Group, MULTI_classification) %>%
#   distinct()

hustle.demul <- hustle.singlet
# hustle.demul@meta.data <- hustle.demul@meta.data %>% 
#   left_join(., donorid)

rownames(hustle.demul@meta.data) <- rownames(hustle.singlet@meta.data)#rownames(hustle.singlet@meta.data)
hustle <- hustle.singlet

QC for number of RNA molecules (nCount_RNA), number of features (nFeature_RNA), mitochondrial RNA percent denoting dead cells (percent.mt) and ribosomal genes percentage (NA cells because of unmatched CellBender cell filtration)

# Visualize QC metrics as a violin plot
DefaultAssay(hustle) <- "RNA"
Idents(hustle) <- "orig.ident"
VlnPlot(hustle, features=c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3) 

VlnPlot(hustle, features=c("percent.ribo")) 

ggplot(hustle@meta.data, aes(x = nFeature_RNA)) + geom_histogram(binwidth = 50)

#ggplot(mono@meta.data, aes(x = nFeature_RNA)) + geom_histogram(binwidth = 20) + xlim(c(0, 600))

ggplot(hustle@meta.data, aes(x = nFeature_RNA)) + 
  geom_histogram(binwidth = 50) +
  facet_wrap( ~ donor_id)

ggplot(hustle@meta.data, aes(x = nCount_RNA)) + geom_histogram(binwidth = 50)

ggplot(hustle@meta.data, aes(x = nCount_RNA)) + 
  geom_histogram(binwidth = 50) +
  facet_wrap( ~ donor_id)

#ggplot(mono@meta.data, aes(x = nCount_RNA)) + geom_histogram(binwidth = 50) + xlim(c(0, 5000))

ggplot(hustle@meta.data, aes(x = percent.mt)) + geom_histogram(binwidth = 1)

ggplot(hustle@meta.data, aes(x = percent.mt)) + 
  geom_histogram(binwidth = 1) +
  facet_wrap( ~ donor_id)

ggplot(hustle@meta.data, aes(x = percent.ribo)) + geom_histogram(binwidth = 1)

ggplot(hustle@meta.data, aes(x = percent.ribo)) + 
  geom_histogram(binwidth = 1) +
  facet_wrap( ~ donor_id)

#ggplot(mono@meta.data, aes(x = percent.Ribosomal)) + geom_histogram(binwidth = 1)

#ggplot(mono@meta.data, aes(x = percent.Largest.Gene)) + geom_histogram(binwidth = 1)
deadcell_nF_lowercutoff <- 150
deadcell_nF_uppercutoff <- 1200
#deadcell_nC_lowercutoff <- 500
weirdcell_ribo_uppercutoff <- 40
deadcell_mt_cutoff <- 10

hustle.meta <- hustle@meta.data %>% 
  mutate(is.dead = case_when(
      nFeature_RNA >= deadcell_nF_lowercutoff & 
        nFeature_RNA <= deadcell_nF_uppercutoff &
        percent.ribo <= weirdcell_ribo_uppercutoff &
        #nCount_RNA >= deadcell_nC_lowercutoff &
        percent.mt <= deadcell_mt_cutoff ~ "FALSE",
      TRUE ~ "TRUE"))

table(hustle.meta$is.dead)
## 
## FALSE  TRUE 
##  1035  1428
ggplot(hustle.meta, aes(x = nFeature_RNA, fill = factor(is.dead))) +
  geom_histogram(bins = 50, position = "identity", alpha = 0.5)

ggplot(hustle.meta, aes(x = nCount_RNA, fill = factor(is.dead))) +
  geom_histogram(bins = 50, position = "identity", alpha = 0.5)

ggplot(hustle.meta, aes(x = percent.mt, fill = factor(is.dead))) +
  geom_histogram(bins = 50, position = "identity", alpha = 0.5)

ggplot(hustle.meta, aes(x = percent.ribo, fill = factor(is.dead))) +
  geom_histogram(bins = 50, position = "identity", alpha = 0.5)

hustle1 <- subset(hustle, subset =  nFeature_RNA >= deadcell_nF_lowercutoff & 
                    nFeature_RNA <= deadcell_nF_uppercutoff & 
                    percent.mt <= deadcell_mt_cutoff & 
                    #nCount_RNA >= deadcell_nC_lowercutoff & 
                    percent.ribo <= weirdcell_ribo_uppercutoff) #nCount_RNA >= 1000 &
hustle1
## An object of class Seurat 
## 14441 features across 1035 samples within 2 assays 
## Active assay: RNA (14430 features, 0 variable features)
##  1 layer present: counts
##  1 other assay present: HTO
##  2 dimensional reductions calculated: pca, tsne
VlnPlot(hustle1, features=c("nCount_RNA", "nFeature_RNA", "percent.mt", "percent.ribo"), group.by = "orig.ident", ncol = 2) 

hustle = hustle1

Assigning hashtags and peptide pools to cells

# assigning cell to donor and pool

hustle@meta.data <- hustle@meta.data %>% 
  mutate(Donor_expt = case_when(
    HTO_classification == "TotalSeq-C-2" ~ "D33",
    HTO_classification == "TotalSeq-C-3" ~ "D45",
    HTO_classification == "TotalSeq-C-4" ~ "D47",
    .default = ""
  )) %>% 
  mutate(pep.pool = case_when(
    HTO_classification == "TotalSeq-C-5" ~ "P2",
    HTO_classification == "TotalSeq-C-6" ~ "P3",
    HTO_classification == "TotalSeq-C-7" ~ "P4",
    HTO_classification == "TotalSeq-C-8" ~ "P5",
    HTO_classification == "TotalSeq-C-9" ~ "P6",
    HTO_classification == "TotalSeq-C-10" ~ "P7",
    HTO_classification == "TotalSeq-C-11" ~ "P8",
    HTO_classification == "TotalSeq-C-12" ~ "P9",
    HTO_classification == "TotalSeq-C-2" ~ "P1",
    HTO_classification == "TotalSeq-C-3" ~ "P1",
    HTO_classification == "TotalSeq-C-4" ~ "P1",
    .default = ""
  ))

# get donor id from SNP for assigning D33, D45, D47

donorid_snp_hto <- hustle@meta.data %>% 
  select(Donor_expt, donor_id) %>% 
  filter(Donor_expt != "") %>% 
  filter(!donor_id %in% c("doublet", "unassigned")) %>% 
  group_by(donor_id) %>% 
  count(Donor_expt, donor_id)

#Put them back in hustle meta data

hustle@meta.data <- hustle@meta.data %>% 
  mutate(Donor_expt = case_when(
    HTO_classification == "TotalSeq-C-2" ~ "D33",
    HTO_classification == "TotalSeq-C-3" ~ "D45",
    HTO_classification == "TotalSeq-C-4" ~ "D47",
    donor_id == "donor0" ~ "D33",
    donor_id == "donor1" ~ "D45",
    donor_id == "donor2" ~ "D47",
    .default = ""
  )) %>% 
  filter(!is.na(donor_id)) %>% 
  mutate(donor_pep.pool = paste(Donor_expt, pep.pool, sep = "_"))

# # remove left over unassigned and doublet "cells"
# Idents(hustle) <- c("donor_id")
# hustle <- subset(hustle, idents = c("doublet", "unassigned"), invert = T)

table(hustle$Donor_expt)
## 
## D33 D45 D47 
## 104 148 769
table(hustle$pep.pool)
## 
##  P1  P2  P3  P4  P5  P6  P7  P8  P9 
## 226 150 127 120  48  95 109  37 109
table(hustle$Donor_expt, hustle$pep.pool)
##      
##        P1  P2  P3  P4  P5  P6  P7  P8  P9
##   D33  19  19   8  11   9   9   6  11  12
##   D45  42  15  13   8  27  17   6  13   7
##   D47 165 116 106 101  12  69  97  13  90
table(hustle$donor_pep.pool)
## 
## D33_P1 D33_P2 D33_P3 D33_P4 D33_P5 D33_P6 D33_P7 D33_P8 D33_P9 D45_P1 D45_P2 
##     19     19      8     11      9      9      6     11     12     42     15 
## D45_P3 D45_P4 D45_P5 D45_P6 D45_P7 D45_P8 D45_P9 D47_P1 D47_P2 D47_P3 D47_P4 
##     13      8     27     17      6     13      7    165    116    106    101 
## D47_P5 D47_P6 D47_P7 D47_P8 D47_P9 
##     12     69     97     13     90

Cells with assigned clonotypes

add_clonotype <- function(seurat_obj){
    tcr <- read.csv("../Data/D33_D45_D47_CD4_Tcells/filtered_contig_annotations.csv")
    # Clonotype-centric info.
    clono <- read.csv("../Data/D33_D45_D47_CD4_Tcells/clonotypes.csv")

    # Subsets so only the first line of each barcode is kept,
    # as each entry for given barcode will have same clonotype.
    tcr <- tcr %>% 
      distinct(barcode, .keep_all = T) %>% 
      # Only keep the barcode and clonotype columns. 
      # We'll get additional clonotype info from the clonotype table.
      select(barcode, raw_clonotype_id) %>% 
      rename(clonotype_id = raw_clonotype_id) %>% 
      # Slap the AA sequences onto our original table by clonotype_id.
      merge(., clono[, c("clonotype_id", "cdr3s_aa")]) %>% 
      tibble::column_to_rownames("barcode")
    
    seurat_obj <- seurat_obj[,rownames(tcr)]
    
    # Add to the Seurat object's metadata.
    clono_seurat <- AddMetaData(object=seurat_obj, metadata=tcr)
    return(clono_seurat)
}

Idents(hustle) <- rownames(hustle@meta.data)
hustle <- add_clonotype(hustle)

table(!is.na(hustle$clonotype_id))
## 
## TRUE 
##  707
# FALSE  TRUE 
#   515   834
# keep only the clonotypes

hustle <- subset(hustle, cells = colnames(hustle)[!is.na(hustle$clonotype_id)])
hustle
## An object of class Seurat 
## 14441 features across 707 samples within 2 assays 
## Active assay: RNA (14430 features, 0 variable features)
##  1 layer present: counts
##  1 other assay present: HTO
##  2 dimensional reductions calculated: pca, tsne
Idents(hustle) <- "HTO_maxID"
RidgePlot(hustle, assay = "HTO", features = rownames(hustle@assays[["HTO"]])[1:5])

RidgePlot(hustle, assay = "HTO", features = rownames(hustle@assays[["HTO"]])[6:11])

Variable genes, cell cycle regression; principal components from elbow plot

hustle <- NormalizeData(hustle, normalization.method = "LogNormalize", scale.factor = 10000)
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes
hustle <- CellCycleScoring(hustle, g2m.features = g2m.genes, s.features = s.genes)

hustle <- FindVariableFeatures(hustle, selection.method = "vst", nfeatures = 2000)
all.genes <- rownames(hustle)
hustle <- ScaleData(hustle) #, features = all.genes
# Identify the 15 most highly variable genes
ranked_variable_genes <- VariableFeatures(hustle)
top_genes <- ranked_variable_genes[1:15]

# Plot the average expression and variance of these genes
# With labels to indicate which genes are in the top 15
p <- VariableFeaturePlot(hustle)
LabelPoints(plot = p, points = top_genes, repel = TRUE)

hustle <- RunPCA(hustle, features = VariableFeatures(object = hustle))
DimPlot(hustle,
        reduction = "pca",
        group.by= "Phase")

hustle <- ScaleData(hustle, vars.to.regress = c("S.Score", "G2M.Score"), features = rownames(hustle))
hustle <- RunPCA(hustle, features = VariableFeatures(object = hustle), npcs = 15, nfeatures.print = 10)
hustle <- RunPCA(hustle, features = c(s.genes, g2m.genes), npcs = 15)
Idents(hustle) <- "Phase"
DimPlot(hustle)

hustle <- SCTransform(hustle, vars.to.regress = c("S.Score", "G2M.Score"))
# hustle <- JackStraw(hustle, num.replicate = 70)
# hustle <- ScoreJackStraw(hustle, dims = 1:10)
# #JackStrawPlot(hustle, dims = 1:10)
ElbowPlot(hustle)

use.pcs = 1:12
8482 -> saved.seed
#hustle <- RunTSNE(hustle, dims=use.pcs, seed.use = saved.seed, perplexity=100)
hustle <- RunPCA(hustle)
hustle <- FindNeighbors(hustle, dims = use.pcs)
hustle <- RunUMAP(hustle, dims = use.pcs, reduction = "pca")
hustle <- FindClusters(hustle, resolution = 1) #,  graph.name = "RNA_snn_res.0.9"
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 707
## Number of edges: 26011
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.6084
## Number of communities: 6
## Elapsed time: 0 seconds
#DimPlot(hustle, reduction = "umap")

Genes in top N principal components

DimHeatmap(hustle, dims = 1:5, nfeatures = 30)

FeaturePlot(hustle, feature = "MKI67", reduction = "umap")

hustle <- RunAzimuth(hustle, reference = "pbmcref")
#DimPlot(hustle, reduction = "umap", group.by = "predicted.celltype.l2")

Gene markers for clusters

markers_all = FindAllMarkers(hustle,genes.use = VariableFeatures(hustle),
    only.pos = F, 
    min.pct = 0.25, 
    thresh.use = 0.25)
DT::datatable(markers_all)

Gene markers unique to each cluster

markers_all_single <- markers_all[markers_all$gene %in% names(table(markers_all$gene))[table(markers_all$gene) == 1],]
DT::datatable(markers_all_single)

Top N marker genes

top15_markers <- markers_all %>% 
  group_by(cluster) %>% 
  slice_max(n = 20, order_by = abs(avg_log2FC))

DT::datatable(top15_markers)
top10 <- markers_all %>% group_by(cluster) %>% top_n(10, avg_log2FC)
#FeaturePlot(hustle, features = top10$gene[6:10])

Visualise cells with assigned clonotypes, predicted celltypes and cell clusters

DimPlot(hustle, reduction = "umap", group.by = "clonotype_id", label = F) + NoLegend()

DimPlot(hustle, reduction = "umap", group.by = "predicted.celltype.l2", label = T)

DimPlot(hustle, reduction = "umap", label = TRUE)

Heatmap with top 10 markers

DoHeatmap(
    object = hustle, 
    features = top10$gene
) 

# write.table(top15_markers, "hustleseq_top15_markers_6clusters.txt", row.names = F, quote = F)
# write.table(markers_all_single, "hustleseq_unique_markers_6clusters.txt", row.names = F, quote = F)

Cells mapped to specific epitopes

hustle.meta <- data.frame(hustle@reductions[["umap"]]@cell.embeddings) %>% 
  tibble::rownames_to_column("Cell") %>% 
  mutate(clonotype_id = substr(hustle@meta.data$clonotype_id, 9, nchar(hustle@meta.data$clonotype_id))) %>% 
  mutate(clone_colour = case_when(
    clonotype_id == "clonotype1" ~ "cornflowerblue",
    clonotype_id == "clonotype2" ~ "cornflowerblue",
    clonotype_id == "clonotype4" ~ "salmon",
    clonotype_id == "clonotype6" ~ "lightgreen",
    clonotype_id == "clonotype9" ~ "navyblue",
    clonotype_id == "clonotype24" ~ "darkred",
    clonotype_id == "clonotype43" ~ "yellow4",
    .default = "gray80"
  )) %>% 
  mutate(peptide.pool = hustle$pep.pool) %>% 
  mutate(pep.pool.colour = case_when(
    peptide.pool == "P1" & clone_colour != "gray80" ~ "#440154",
    peptide.pool == "P2" & clone_colour != "gray80" ~ "#472d7b",
    peptide.pool == "P3" & clone_colour != "gray80" ~ "#3b528b",
    peptide.pool == "P4" & clone_colour != "gray80" ~ "#2c728e",
    peptide.pool == "P5" & clone_colour != "gray80" ~ "#21918c",
    peptide.pool == "P6" & clone_colour != "gray80" ~ "#28ae80",
    peptide.pool == "P7" & clone_colour != "gray80" ~ "#5ec962",
    peptide.pool == "P8" & clone_colour != "gray80" ~ "#addc30",
    peptide.pool == "P9" & clone_colour != "gray80" ~ "#fde725",
    .default = "gray80"
  ))

ggplot(hustle.meta, aes(umap_1, umap_2, label = clonotype_id,), colour = clonotype_id) +
  geom_point(size = 3, alpha = 0.8, colour = hustle.meta$clone_colour) +
  theme_bw() +
  theme(legend.position = "right")

Peptide pool didn’t determine phenotype

ggplot(hustle.meta, aes(umap_1, umap_2, label = peptide.pool, fill = peptide.pool, colour = peptide.pool)) +
  geom_point(size = 3, alpha = 0.8) +
  theme_bw()